Chapter 9 Functional Analysis

9.1 Functional overview

n_genes <- genome_annotations %>%
  group_by(genome) %>%
  summarize(n_genes = n())

n_genes
# A tibble: 93 × 2
   genome           n_genes
   <chr>              <int>
 1 EHA00531_bin.1      4139
 2 EHA00535_bin.6      3426
 3 EHA00539_bin.6      3839
 4 EHA00726_bin.13     3950
 5 EHA00928_bin.90     3569
 6 EHA01202_bin.66     3139
 7 EHA01281_bin.18     3922
 8 EHA01439_bin.44     3572
 9 EHA01634_bin.14     3731
10 EHA01845_bin.103    3752
# ℹ 83 more rows

9.1.0.1 Predicted genes

pred_genes <- genome_annotations %>%
  nrow()

cat(pred_genes)
371999

9.1.0.2 Number of annotated genes and percentages

#How many genes have at least 1 annotation
genome_annota <- genome_annotations %>%
  filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
  nrow()

cat(genome_annota)
311777
#Percentage of predicted genes with at least 1 annotation
genome_annota*100/pred_genes
[1] 83.81125

9.1.0.3 Number of KEGG annotatated genes and percentages

# KEGG annotation
kegg_annota <- genome_annotations %>%
  filter(!is.na(kegg)) %>%
  nrow()
cat(kegg_annota)
243658
# KEGG annotation percentage
kegg_annota*100/genome_annota
[1] 78.15137
# AMR annotation
amr_annota <- genome_annotations %>%
  filter(resistance_type == "AMR") %>%
  nrow()
cat(amr_annota)
26372
# AMR annotation percentage
amr_annota*100/genome_annota
[1] 8.45861
# VF annotation
vf_annota <- genome_annotations %>%
  filter(!is.na(vf)) %>%
  nrow()
cat(vf_annota)
70648
# VF annotation percentage
vf_annota*100/genome_annota
[1] 22.65979
n_pred_genes <- genome_annotations %>%
  group_by(mag_id) %>%
  summarize(n_genes = n()) %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, source),
    by = c("mag_id" = "ID")
  )

annotated_genes <- genome_annotations %>%
  filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
  group_by(mag_id) %>%
  summarize(n_annotated_genes = n()) %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, source),
    by = c("mag_id" = "ID")
  )
ggplot(n_pred_genes, aes(x= source, y = n_genes, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_boxplot()+ 
  geom_point()+
  theme_classic()+
  labs(y = "Gene Number", x = "Source")

ggplot(annotated_genes, aes(x= source, y = n_annotated_genes, fill = source))+
  scale_fill_manual(values = source_colors)+
  geom_boxplot()+ 
  geom_point()+
  theme_classic()+
  labs(y = "Annotated Gene Number", x = "Source")

wilcox.test(n_genes ~ source, data=n_pred_genes)  %>%
  tidy()
# A tibble: 1 × 4
  statistic   p.value method                                            alternative
      <dbl>     <dbl> <chr>                                             <chr>      
1      394. 0.0000776 Wilcoxon rank sum test with continuity correction two.sided  
wilcox.test(n_annotated_genes ~ source, data=annotated_genes)  %>%
  tidy()
# A tibble: 1 × 4
  statistic   p.value method                                            alternative
      <dbl>     <dbl> <chr>                                             <chr>      
1      350. 0.0000153 Wilcoxon rank sum test with continuity correction two.sided  

9.2 KEGG

9.2.1 Nº of MAGs with KOs

#Proportion of MAGs belonging to each group per KEGG
#I want to know for example 5% of MAGs in the EHI group have this KEGG, but 40 % of MAGs in the GTDB group have it

# KEGG presence/absence
kegg_presence <- genome_annotations %>%
  filter(mag_id != "GCA_015060925.1") %>%   # remove weird MAG
  filter(!is.na(kegg)) %>%
  dplyr::select(mag_id, kegg) %>%
  distinct() 

#Add the source info
kegg_with_source <- kegg_presence %>%
  left_join(
    genome_metadata %>% dplyr::select(ID, source),
    by = c("mag_id" = "ID")
  )

# Count how many MAGs in each KEGG
kegg_mag_counts <- kegg_with_source %>%
  group_by(source, kegg) %>%
  summarise(
    n_mags = n(),
    .groups = "drop"
  )

#Count total MAGs per source (except the outlier)
total_mags_per_source <- genome_metadata %>%
  filter(ID != "GCA_015060925.1") %>%
  group_by(source) %>%
  summarise(
    total_mags = n_distinct(ID),
    .groups = "drop"
  )

#Calculate proportions of MAGs from each source in each KEGG
kegg_mag_proportions <- kegg_mag_counts %>%
  left_join(total_mags_per_source, by = "source") %>%
  mutate(
    proportion = n_mags / total_mags,
    absent = total_mags - n_mags
  )

9.2.1.1 Plotting KEGG MAG proportions

kegg_mag_proportions %>%
  filter(n_mags >= 20)
# A tibble: 3,068 × 6
   source kegg   n_mags total_mags proportion absent
   <chr>  <chr>   <int>      <int>      <dbl>  <int>
 1 EHI    K00010     25         25       1         0
 2 EHI    K00012     24         25       0.96      1
 3 EHI    K00013     25         25       1         0
 4 EHI    K00014     22         25       0.88      3
 5 EHI    K00015     24         25       0.96      1
 6 EHI    K00018     22         25       0.88      3
 7 EHI    K00024     25         25       1         0
 8 EHI    K00027     23         25       0.92      2
 9 EHI    K00030     24         25       0.96      1
10 EHI    K00033     25         25       1         0
# ℹ 3,058 more rows
ggplot(kegg_mag_proportions,
       aes(x = kegg, y = proportion, fill = source)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = source_colors) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    x = "KEGG ortholog",
    y = "Proportion of MAGs"
  )

9.2.1.2 Statistical testing of MAG proportions

fisher_results <- kegg_mag_proportions %>%
  dplyr::select(kegg, source, n_mags, absent) %>%
  pivot_wider(
    names_from = source,
    values_from = c(n_mags, absent),
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(
    p_value = fisher.test(
      matrix(
        c(n_mags_EHI,  absent_EHI,
          n_mags_GTDB, absent_GTDB),
        nrow = 2,
        byrow = TRUE
      )
    )$p.value
  ) %>%
  ungroup() %>%
  mutate(p_adj = p.adjust(p_value, method = "BH"))

fisher_results <- fisher_results %>%
  mutate(
    prop_EHI  = n_mags_EHI  / (n_mags_EHI  + absent_EHI),
    prop_GTDB = n_mags_GTDB / (n_mags_GTDB + absent_GTDB),
    diff_prop = prop_GTDB - prop_EHI
  )
sig_fisher_results <- fisher_results%>% filter(p_adj < 0.05) 
sig_fisher_results
# A tibble: 4 × 10
  kegg   n_mags_EHI n_mags_GTDB absent_EHI absent_GTDB      p_value    p_adj prop_EHI prop_GTDB diff_prop
  <chr>       <int>       <int>      <int>       <int>        <dbl>    <dbl>    <dbl>     <dbl>     <dbl>
1 K03205          3          40         22          28 0.0000486    0.0281       0.12     0.588     0.468
2 K06926          2          38         23          30 0.0000318    0.0245       0.08     0.559     0.479
3 K13525         14          68         11           0 0.0000000731 0.000169     0.56     1         0.44 
4 K20331         13          64         12           4 0.0000118    0.0137       0.52     0.941     0.421
library(KEGGREST)

# Function to get info from KEGG
get_kegg_info <- function(ko_id) {
  query <- keggGet(ko_id)[[1]]
  
  # Extract Name and Definition
  name <- ifelse(!is.null(query$NAME), query$NAME, "N/A")
  definition <- ifelse(!is.null(query$DEFINITION), query$DEFINITION, "N/A")
  
  # Extract Pathways (often multiple)
  pathways <- if(!is.null(query$PATHWAY)) {
    paste(names(query$PATHWAY), query$PATHWAY, sep = ": ", collapse = "; ")
  } else {
    "No pathway assigned"
  }
  
  return(c(KO = ko_id, Name = name, Definition = definition, Pathways = pathways))
}
signigicant_kos <- fisher_results%>% filter(p_adj < 0.05) %>% pull(kegg)


ko_list <- lapply(signigicant_kos, get_kegg_info)
ko_summary <- as.data.frame(do.call(rbind, ko_list))

print(ko_summary)
      KO                                                Name Definition
1 K03205 type IV secretion system protein VirD4 [EC:7.4.2.8]        N/A
2 K06926                             uncharacterized protein        N/A
3 K13525           transitional endoplasmic reticulum ATPase        N/A
4 K20331                  toxoflavin synthase [EC:2.1.1.349]        N/A
                                                                                                                                                                                                            Pathways
1                                                                                                                                                                               map03070: Bacterial secretion system
2                                                                                                                                                                                                No pathway assigned
3 map04137: Mitophagy - animal; map04141: Protein processing in endoplasmic reticulum; map05014: Amyotrophic lateral sclerosis; map05022: Pathways of neurodegeneration - multiple diseases; map05134: Legionellosis
4                                                                                                                                                                                           map02024: Quorum sensing
fish_results_names <- left_join(sig_fisher_results, ko_summary, join_by(kegg== KO))


heatmap_data <- fish_results_names %>%
  dplyr::select(Name, prop_EHI, prop_GTDB)%>%
  pivot_longer(!Name, names_to = "source", values_to= "proportion" ) %>%
   mutate(
    source = case_when(
      source == "prop_EHI"  ~ "EHI",
      source == "prop_GTDB" ~ "GTDB",
      TRUE ~ source
    )
  )
ggplot(heatmap_data,
       aes(x = source, y = Name, fill = proportion)) +
  geom_tile() +
  scale_fill_viridis_c(
    name = "Proportion of MAGs",
    limits = c(0, 1)
  ) +
  theme_minimal() +
  labs(
    x = "Source",
    y = "KEGG ortholog",
    title = "Differential KEGG presence between EHI and GTDB"
  )

9.2.2 Functional Ordination

PCA of KEGG annotations:

kegg_counts <- genome_annotations %>%
  filter(mag_id != "GCA_015060925.1")%>% #this is the smaller mag that's weird
  filter(!is.na(kegg)) %>%          
  dplyr::count(mag_id, kegg) %>%           
  pivot_wider(
    names_from = kegg,
    values_from = n,
    values_fill = 0
  )

# Normalization
kegg_rel <- kegg_counts %>%
  column_to_rownames("mag_id")
kegg_rel <- kegg_rel / rowSums(kegg_rel)  # Each row sums to 1

# Remove zero variance
kegg_rel_nz <- kegg_rel[, apply(kegg_rel, 2, sd) > 0]

# PCA with scaling
pca <- prcomp(kegg_rel_nz, scale. = TRUE)

# Check variance explained
summary(pca)$importance[,1:5]
                            PC1     PC2     PC3      PC4      PC5
Standard deviation     17.94329 9.61480 8.85902 8.735097 8.450299
Proportion of Variance  0.13926 0.03998 0.03395 0.033000 0.030890
Cumulative Proportion   0.13926 0.17924 0.21319 0.246190 0.277080
country_palette <- c(
  # Southern Europe
  "Spain" = "#1B9E77",
  "Italy" = "#33A02C",
  "Greece" = "#66C2A5",
  "Portugal" = "#2CA25F",
  "Malta" = "#99D8C9",

  # Northern/Central Europe
  "Germany" = "#1F78B4",
  "United Kingdom" = "#4A90E2",
  "Ireland" = "#6BAED6",

  # East Asia 
  "Japan" = "#E31A1C",
  "South Korea" = "#FB6A4A",
  "China" = "#CB181D",

  # North America 
  "USA" = "#756BB1",
  "Canada" = "#9E9AC8",

  # Distinct
  "Australia" = "#FFD92F",   
  "Greenland" = "#A6CEE3",

  "none" = "grey70"
)
scores <- as.data.frame(pca$x)
scores$ID <- rownames(scores)

scores <- scores %>%
  left_join(genome_metadata, by = "ID")

ggplot(scores, aes(PC1, PC2, color = source)) +
  geom_point(size = 2) +
   scale_color_manual(values = source_colors)+
  theme_minimal() +
  labs(
    title = "PCA of KEGG annotations across MAGs",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

ggplot(scores, aes(PC2, PC3, color = source)) +
   scale_color_manual(values = source_colors)+
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "P2 vs PC3 of KEGG annotations across MAGs",
    x = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)"),
    y = paste0("PC3 (", round(summary(pca)$importance[2,3] * 100, 1), "%)")
  )

ggplot(scores, aes(PC3, PC4, color = source)) +
   scale_color_manual(values = source_colors)+
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "P3 vs PC4 of KEGG annotations across MAGs",
    x = paste0("PC3 (", round(summary(pca)$importance[2,3] * 100, 1), "%)"),
    y = paste0("PC4 (", round(summary(pca)$importance[2,4] * 100, 1), "%)")
  )

ggplot(scores, aes(PC1, PC2, color = genome_size)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "KEGG PCA colored by genome size",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

ggplot(scores, aes(PC1, PC2, color = completeness)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "KEGG PCA colored by completeness",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

ggplot(scores, aes(PC1, PC2, color = host_species)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "KEGG PCA colored by host species ",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

ggplot(scores, aes(PC1, PC2, color = country)) +
  geom_point(size = 2) +
  theme_minimal() +
  scale_color_manual(values = country_palette, na.value = "grey70")+
  labs(
    title = "KEGG PCA colored by country ",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

ggplot(scores, aes(PC1, PC2, color = continent)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "KEGG PCA colored by continent",
    x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
  )

loadings <- pca$rotation

abs_loadings <- apply(abs(loadings[, 1:2]), 1, sum)
top_combined <- sort(abs_loadings, decreasing = TRUE)[1:10]
top_combined
    K00428     K13663     K19783     K12035     K03310     K19234     K01442     K07012     K15971     K02574 
0.09395610 0.09123926 0.08962148 0.08809128 0.08623159 0.08587434 0.08425436 0.08356048 0.08308556 0.08301173 
library(KEGGREST)

top_loadings <- loadings %>%
  as.data.frame() %>%
  rownames_to_column("KEGG") %>%
  arrange(desc(abs(PC1))) 

head(top_loadings)
    KEGG        PC1          PC2          PC3         PC4        PC5         PC6         PC7          PC8          PC9
1 K00318 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
2 K00603 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
3 K00602 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
4 K00241 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
5 K00760 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
6 K00761 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
         PC10         PC11         PC12         PC13        PC14        PC15        PC16        PC17         PC18
1 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
2 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
3 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
4 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
5 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
6 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
         PC19         PC20         PC21          PC22        PC23        PC24         PC25         PC26         PC27
1 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
2 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
3 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
4 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
5 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
6 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
           PC28        PC29         PC30         PC31          PC32        PC33         PC34          PC35         PC36
1 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
2 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
3 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
4 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
5 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
6 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
          PC37        PC38        PC39          PC40          PC41         PC42         PC43          PC44          PC45
1 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
2 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
3 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
4 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
5 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
6 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
           PC46         PC47         PC48        PC49          PC50         PC51        PC52          PC53          PC54
1 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
2 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
3 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
4 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
5 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
6 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
         PC55         PC56        PC57        PC58         PC59        PC60         PC61        PC62       PC63
1 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
2 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
3 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
4 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
5 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
6 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
           PC64         PC65        PC66         PC67          PC68        PC69        PC70        PC71         PC72
1 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
2 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
3 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
4 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
5 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
6 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
          PC73         PC74         PC75        PC76         PC77        PC78        PC79          PC80        PC81
1 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
2 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
3 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
4 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
5 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
6 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
         PC82          PC83          PC84        PC85        PC86         PC87          PC88        PC89         PC90
1 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
2 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
3 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
4 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
5 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
6 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
          PC91         PC92         PC93
1 -0.003479609 0.0005672297 -0.060196326
2 -0.003479609 0.0005672297 -0.028827514
3 -0.003479609 0.0005672297 -0.015508586
4 -0.003479609 0.0005672297 -0.007955012
5 -0.003479609 0.0005672297  0.005988844
6 -0.003479609 0.0005672297  0.005988844
top_kos <- head(top_loadings)%>%pull(KEGG)

ko_pathways <- lapply(top_kos, function(k) {
  info <- keggGet(paste0("ko:", k))[[1]]
  pathways <- info$PATHWAY
  if(is.null(pathways)) pathways <- NA
  return(pathways)
})

names(ko_pathways) <- top_kos
ko_pathways
$K00318
                               map00330                                map01100                                map01110 
      "Arginine and proline metabolism"                    "Metabolic pathways" "Biosynthesis of secondary metabolites" 

$K00603
                   map00340                    map00670                    map01100 
     "Histidine metabolism" "One carbon pool by folate"        "Metabolic pathways" 

$K00602
                               map00230                                map00670                                map01100 
                    "Purine metabolism"             "One carbon pool by folate"                    "Metabolic pathways" 
                               map01110                                map01523 
"Biosynthesis of secondary metabolites"                 "Antifolate resistance" 

$K00241
                                      map00020                                       map00190 
                   "Citrate cycle (TCA cycle)"                    "Oxidative phosphorylation" 
                                      map00650                                       map00720 
                        "Butanoate metabolism"               "Other carbon fixation pathways" 
                                      map01100                                       map01110 
                          "Metabolic pathways"        "Biosynthesis of secondary metabolites" 
                                      map01120                                       map01200 
"Microbial metabolism in diverse environments"                            "Carbon metabolism" 

$K00760
                               map00230                                map00983                                map01100 
                    "Purine metabolism"       "Drug metabolism - other enzymes"                    "Metabolic pathways" 
                               map01110                                map01232 
"Biosynthesis of secondary metabolites"                 "Nucleotide metabolism" 

$K00761
               map00240                map01100                map01232 
"Pyrimidine metabolism"    "Metabolic pathways" "Nucleotide metabolism" 

9.2.3 KEGG heatmap

# Relative abundance heatmap
pheatmap(kegg_rel_nz,
  color = viridis(100, option = "viridis"),
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  fontsize = 9,
  border_color = NA
)

# For scaled heatmap -> remove zero variance columns 
col_vars <- apply(kegg_rel_nz, 2, var, na.rm = TRUE)
matrix_filtered <- kegg_rel_nz[, col_vars > 0 & !is.na(col_vars)]

# Scale the filtered matrix
scaled <- scale(matrix_filtered, center = TRUE, scale = TRUE)

# Check for any remaining Inf/NaN values
if(any(!is.finite(scaled))) {
  scaled[!is.finite(scaled)] <- 0  # Replace Inf/NaN with 0
}

# Scaled heatmap
pheatmap(scaled,
  color = viridis(100, option = "viridis"),
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  fontsize = 5,
  border_color = NA
)

9.2.4 Testing the differences in KEGG annotations with PERMANOVA

# Distance matrix from normalized data
kegg_dist <- vegdist(kegg_rel_nz, method = "bray") 

# Add metadata
kegg_rel_nz_meta <- kegg_rel_nz %>%
  rownames_to_column("ID") %>%
  left_join(genome_metadata, by = "ID")

#Beta dispersion test
dispersion <- betadisper(kegg_dist, kegg_rel_nz_meta$source)
anova(dispersion) 
Analysis of Variance Table

Response: Distances
          Df   Sum Sq   Mean Sq F value   Pr(>F)   
Groups     1 0.005554 0.0055536  9.9304 0.002201 **
Residuals 91 0.050891 0.0005592                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA test
permanova_result <- adonis2(kegg_dist ~ genome_size + source, 
                            data = kegg_rel_nz_meta, 
                            permutations = 999)
print(permanova_result)
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

adonis2(formula = kegg_dist ~ genome_size + source, data = kegg_rel_nz_meta, permutations = 999)
         Df SumOfSqs     R2      F Pr(>F)    
Model     2  0.20587 0.2843 17.876  0.001 ***
Residual 90  0.51825 0.7157                  
Total    92  0.72412 1.0000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(
  kegg_dist ~ genome_size + source,
  data = kegg_rel_nz_meta,
  permutations = 999,
  by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

adonis2(formula = kegg_dist ~ genome_size + source, data = kegg_rel_nz_meta, permutations = 999, by = "margin")
            Df SumOfSqs      R2       F Pr(>F)    
genome_size  1  0.14688 0.20284 25.5069  0.001 ***
source       1  0.01024 0.01415  1.7789  0.049 *  
Residual    90  0.51825 0.71570                   
Total       92  0.72412 1.00000                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kegg_counts_mat <- kegg_counts %>%
  column_to_rownames("mag_id")

kegg_pa <- (kegg_counts_mat > 0) * 1


# remove zero-variance KOs
kegg_pa_nz <- kegg_pa[, colSums(kegg_pa) > 0 & colSums(kegg_pa) < nrow(kegg_pa)]

meta <- genome_metadata %>%
  filter(ID %in% rownames(kegg_pa_nz)) %>%
  column_to_rownames("ID")

# VERY IMPORTANT: enforce same order
meta <- meta[rownames(kegg_pa_nz), ]

stopifnot(identical(rownames(meta), rownames(kegg_pa_nz)))


kegg_dist_pa <- vegdist(kegg_pa_nz, method = "jaccard", binary = TRUE)

# dispersion check
disp_pa <- betadisper(kegg_dist_pa, meta$source)
anova(disp_pa)
Analysis of Variance Table

Response: Distances
          Df   Sum Sq   Mean Sq F value    Pr(>F)    
Groups     1 0.027469 0.0274695  17.117 7.827e-05 ***
Residuals 91 0.146036 0.0016048                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA
adonis2(
  kegg_dist_pa ~ genome_size +completeness+ source,
  data = meta,
  permutations = 999, by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

adonis2(formula = kegg_dist_pa ~ genome_size + completeness + source, data = meta, permutations = 999, by = "margin")
             Df SumOfSqs      R2      F Pr(>F)    
genome_size   1  0.09249 0.04627 4.6819  0.001 ***
completeness  1  0.01738 0.00870 0.8798  0.611    
source        1  0.04227 0.02115 2.1400  0.009 ** 
Residual     89  1.75813 0.87964                  
Total        92  1.99870 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Identify rows that have NO missing values
keep_indices <- complete.cases(meta[, c("continent", "genome_size", "completeness", "source")])

# Filter the metadata
meta_clean <- meta[keep_indices, ]


kegg_dist_mat <- as.matrix(kegg_dist_pa)
kegg_dist_clean <- as.dist(kegg_dist_mat[keep_indices, keep_indices])


adonis2(
  kegg_dist_clean ~ country + genome_size + completeness,
  data = meta_clean,
  permutations = 999, 
  by = "margin"
)
Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999

adonis2(formula = kegg_dist_clean ~ country + genome_size + completeness, data = meta_clean, permutations = 999, by = "margin")
             Df SumOfSqs      R2      F Pr(>F)    
country      14  0.43481 0.28606 1.8155  0.001 ***
genome_size   1  0.04218 0.02775 2.4659  0.004 ** 
completeness  1  0.01180 0.00776 0.6896  0.835    
Residual     53  0.90666 0.59647                  
Total        69  1.52004 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pcoa_pa <- cmdscale(kegg_dist_pa, eig = TRUE, k = 2)

pcoa_df <- data.frame(
  ID = rownames(kegg_pa_nz),
  PC1 = pcoa_pa$points[,1],
  PC2 = pcoa_pa$points[,2],
  source = meta$source
) %>%
  left_join(genome_metadata, by = "ID")


ggplot(pcoa_df, aes(PC1, PC2, color = source.x)) +
  geom_point(size = 3, alpha = 0.8) +
  theme_classic()

ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
  geom_point(size = 3, alpha = 0.8) +
  theme_classic()

ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
  geom_point(size = 3, alpha = 0.8) +
  theme_classic()

9.2.5 Principal Coordinate Analysis - KEGG

pcoa_res <- cmdscale(kegg_dist, k = 2, eig = TRUE)

# Calculate variance explained 
variance_explained <- pcoa_res$eig / sum(pcoa_res$eig)

pcoa_df <- data.frame(
  PC1 = pcoa_res$points[,1], 
  PC2 = pcoa_res$points[,2], 
  ID = rownames(kegg_rel_nz)
) %>%
  left_join(genome_metadata, by = "ID")


ggplot(pcoa_df, aes(PC1, PC2, color = source)) +
  scale_color_manual(values = source_colors)+
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of KEGG annotations across MAGs",
    x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of KEGG annotations across MAGs",
    x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
  )

ggplot(pcoa_df, aes(PC1, PC2, color = host_species)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of KEGG annotations across MAGs (colored by host species)",
    x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
  )

 ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
   geom_point(size = 2) +
   theme_minimal() +
   labs(
   title = "PCoA of KEGG annotations across MAGs (colored by country)",
     x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
     y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
   )

 ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
   geom_point(size = 2) +
   theme_minimal() +
   labs(
     title = "PCoA of KEGG annotations across MAGs (colored by continent)",
     x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
     y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
   )

library(patchwork)
p_main <- ggplot(pcoa_df, aes(PC1, PC2, color = source)) +
  scale_color_manual(values = source_colors) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "PCoA of KEGG annotations across MAGs",
    x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
  ) +
  theme(
    legend.position = "right",
    axis.title.x = element_blank(),  
    axis.text.x = element_blank(),   
    axis.ticks.x = element_blank(),  
    plot.margin = margin(5, 5, 0, 5)
  )

cor_value <- cor(pcoa_df$PC1, pcoa_df$completeness, method = "spearman")
cor_pvalue <- cor.test(pcoa_df$PC1, pcoa_df$completeness, 
                       method = "spearman")$p.value
Warning in cor.test.default(pcoa_df$PC1, pcoa_df$completeness, method = "spearman"): Cannot compute exact p-value with
ties
cor_text <- paste0("Spearman ρ = ", round(cor_value, 3), 
                  "\np < ", format.pval(cor_pvalue, digits = 2))

p_completeness_annotated <- ggplot(pcoa_df, aes(x = PC1, y = completeness, 
                                                 color = source)) +
  scale_color_manual(values = source_colors) +
  geom_point(alpha = 0.6, size = 2) +
  geom_smooth(method = "lm", se = TRUE, color = "black", linewidth = 1) +
  annotate("text", 
           x = min(pcoa_df$PC1) + 0.1 * diff(range(pcoa_df$PC1)), 
           y = max(pcoa_df$completeness) - 5,
           label = cor_text, 
           hjust = 0, 
           size = 3,  # Changed from 4 to 3 (smaller)
           fontface = "plain") +  # Changed from "bold" to "plain"
  theme_minimal() +
  labs(
    y = "Completeness (%)",
    x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)")
  ) +
  theme(
    legend.position = "none",
    plot.margin = margin(0, 5, 5, 5),
    panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)  # Add border
  )


p_main <- p_main + 
  theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5))

combined_annotated <- p_main / p_completeness_annotated + 
  plot_layout(heights = c(3, 1), guides = "collect")

print(combined_annotated)
`geom_smooth()` using formula = 'y ~ x'

9.2.6 Differential abundance

Differential expression analysis- trying to find which KEGG orthologs are significantly different between EHI vs GTDB DESeq2

library(DESeq2)
Cargando paquete requerido: S4Vectors
Cargando paquete requerido: stats4
Cargando paquete requerido: BiocGenerics

Adjuntando el paquete: 'BiocGenerics'
The following object is masked from 'package:gridExtra':

    combine
The following object is masked from 'package:tinytable':

    colnames
The following objects are masked from 'package:lubridate':

    intersect, setdiff, union
The following objects are masked from 'package:dplyr':

    combine, intersect, setdiff, union
The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated,
    eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rownames, sapply, setdiff,
    table, tapply, union, unique, unsplit, which.max, which.min

Adjuntando el paquete: 'S4Vectors'
The following objects are masked from 'package:Matrix':

    expand, unname
The following object is masked from 'package:ggtree':

    expand
The following objects are masked from 'package:lubridate':

    second, second<-
The following objects are masked from 'package:dplyr':

    first, rename
The following object is masked from 'package:tidyr':

    expand
The following object is masked from 'package:utils':

    findMatches
The following objects are masked from 'package:base':

    expand.grid, I, unname
Cargando paquete requerido: IRanges

Adjuntando el paquete: 'IRanges'
The following object is masked from 'package:webshot':

    resize
The following object is masked from 'package:rstatix':

    desc
The following object is masked from 'package:nlme':

    collapse
The following object is masked from 'package:ggtree':

    collapse
The following object is masked from 'package:phyloseq':

    distance
The following object is masked from 'package:lubridate':

    %within%
The following objects are masked from 'package:dplyr':

    collapse, desc, slice
The following object is masked from 'package:purrr':

    reduce
The following object is masked from 'package:R.oo':

    trim
The following object is masked from 'package:grDevices':

    windows
Cargando paquete requerido: GenomicRanges
Cargando paquete requerido: GenomeInfoDb
Cargando paquete requerido: SummarizedExperiment
Cargando paquete requerido: MatrixGenerics
Cargando paquete requerido: matrixStats
Warning: package 'matrixStats' was built under R version 4.4.3

Adjuntando el paquete: 'matrixStats'
The following object is masked from 'package:dplyr':

    count

Adjuntando el paquete: 'MatrixGenerics'
The following objects are masked from 'package:matrixStats':

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts, colCummaxs, colCummins,
    colCumprods, colCumsums, colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs, colMads, colMaxs,
    colMeans2, colMedians, colMins, colOrderStats, colProds, colQuantiles, colRanges, colRanks, colSdDiffs,
    colSds, colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads, colWeightedMeans,
    colWeightedMedians, colWeightedSds, colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods, rowCumsums, rowDiffs, rowIQRDiffs,
    rowIQRs, rowLogSumExps, rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins, rowOrderStats,
    rowProds, rowQuantiles, rowRanges, rowRanks, rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs,
    rowVars, rowWeightedMads, rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars
Cargando paquete requerido: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.

Adjuntando el paquete: 'Biobase'
The following object is masked from 'package:MatrixGenerics':

    rowMedians
The following objects are masked from 'package:matrixStats':

    anyMissing, rowMedians
The following object is masked from 'package:phyloseq':

    sampleNames
# Prepare the count matrix (KOs as rows, MAGs as columns)
# Ensure only integer columns are kept
counts_mtx <- kegg_counts %>%
  column_to_rownames("mag_id") %>%
  t() 

# Prepare metadata (Must match column names of counts_mtx)
metadata <- genome_metadata %>%
  filter(ID %in% colnames(counts_mtx)) %>%
  column_to_rownames("ID")

# Ensure order matches exactly
metadata <- metadata[colnames(counts_mtx), , drop = FALSE]

#Create DESeq2 object
dds <- DESeqDataSetFromMatrix(countData = counts_mtx,
                              colData = metadata,
                              design = ~ source)
Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are characters, converting to
factors
# Run the differential analysis
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
# save results (Wildlife vs Human)
res <- results(dds, contrast=c("source", "EHI", "GTDB"), alpha=0.05)


summary(res)

out of 2312 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 0, 0%
LFC < 0 (down)     : 10, 0.43%
outliers [1]       : 0, 0%
low counts [2]     : 82, 3.5%
(mean count < 0)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
res_df <-res %>% as.data.frame %>% 
  rownames_to_column("KO") %>% 
  as_tibble
ggplot(res_df,aes(x=baseMean, y=log2FoldChange)) + 
  geom_point(alpha=0.8) + 
  geom_smooth() + 
  scale_x_log10() + 
  geom_hline(yintercept = 0, alpha = 0.75,
  color="red")+
  theme_bw()+ coord_cartesian(ylim=c(-10, 10))
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

res_df_2<- res_df%>% 
  mutate(label2= ifelse (abs(log2FoldChange) > 0.58 ,KO, ""))

ggplot(res_df_2, aes(x=baseMean, 
                 y=log2FoldChange, 
                 col=padj<0.05, label=label2)) + 
  geom_point(alpha=0.5) + 
  scale_x_log10() + 
  geom_hline(yintercept = 0, alpha = 0.75,
  color="red")+
  geom_text_repel()+
  theme_bw()+ coord_cartesian(ylim=c(-10, 10))
Warning: ggrepel: 113 unlabeled data points (too many overlaps). Consider increasing max.overlaps

ggplot(res_df_2, aes(x=log2FoldChange, y=-log10(pvalue), color=padj < 0.05, label=label2)) + 
  geom_point(alpha=0.5) +
  geom_vline(xintercept = 1, alpha = 0.75, 
  linetype="dashed")+
  geom_vline(xintercept = -1, alpha = 0.75, 
  linetype="dashed")+
   geom_text_repel() +
  theme_bw()
Warning: ggrepel: 89 unlabeled data points (too many overlaps). Consider increasing max.overlaps

9.2.7 Wilcoxon

#Transform to long format and run test for every KO
wilcox_results <- kegg_rel_nz_meta %>%
  pivot_longer(cols = starts_with("K"), 
               names_to = "KO", 
               values_to = "rel_abundance") %>%
  group_by(KO) %>%
  do(tidy(wilcox.test(rel_abundance ~ source, data = .))) %>%
  ungroup()

#  Adjust p-values
wilcox_results <- wilcox_results %>%
  mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
  filter(p.adj < 0.05) %>%
  arrange(p.adj)

head(wilcox_results)
# A tibble: 6 × 6
  KO     statistic      p.value method                                            alternative     p.adj
  <chr>      <dbl>        <dbl> <chr>                                             <chr>           <dbl>
1 K08138     1483  0.0000000422 Wilcoxon rank sum test with continuity correction two.sided   0.0000976
2 K00013     1331  0.0000313    Wilcoxon rank sum test with continuity correction two.sided   0.000189 
3 K00024     1339  0.0000230    Wilcoxon rank sum test with continuity correction two.sided   0.000189 
4 K00033     1325  0.0000392    Wilcoxon rank sum test with continuity correction two.sided   0.000189 
5 K00052     1329  0.0000337    Wilcoxon rank sum test with continuity correction two.sided   0.000189 
6 K00053     1344. 0.0000186    Wilcoxon rank sum test with continuity correction two.sided   0.000189 
res_df <- as.data.frame(res) %>%
  rownames_to_column("KO") %>%
  filter(padj < 0.05) %>%
  arrange(padj)

# Find KOs that are significant in BOTH tests
common_KOs <- intersect(res_df$KO, wilcox_results$KO)

# Boxplot of the top hit
top_ko <- common_KOs[1]

ggplot(kegg_rel_nz_meta, aes(x = source, y = .data[[top_ko]], fill = source)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  theme_minimal() +
  labs(title = paste("Distribution of", top_ko),
       y = "Relative Abundance")

# Filter data to only include the common KOs
plot_data <- kegg_rel_nz_meta %>%
  dplyr::select(ID, source, all_of(common_KOs)) %>%
  pivot_longer(cols = all_of(common_KOs), 
               names_to = "KO", 
               values_to = "Relative_Abundance")


ggplot(plot_data, aes(x = source, y = Relative_Abundance, fill = source)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.4, size = 1) +
  facet_wrap(~KO, scales = "free_y", ncol = 5) + # 'free_y' is important as abundance scales vary
  scale_fill_manual(values = source_colors) +
  theme_minimal() +
  theme(strip.text = element_text(face = "bold"),
        legend.position = "bottom") +
  labs(title = "Relative Abundance of Consensus Significant KOs",
       x = "Host Source",
       y = "Relative Abundance")

Check alignment with DESeq:

# Calculate medians for each group to see direction
direction_check <- kegg_rel_nz_meta %>%
  dplyr::select(source, all_of(common_KOs)) %>%
  pivot_longer(-source, names_to = "KO", values_to = "val") %>%
  group_by(KO, source) %>%
  summarize(median_val = median(val), .groups = 'drop') %>%
  pivot_wider(names_from = source, values_from = median_val) %>%
  mutate(Enriched_In = ifelse(GTDB > EHI, "GTDB", "EHI"))

# Merge with your DESeq2 results to see if they align
final_comparison <- res_df %>%
  filter(KO %in% common_KOs) %>%
  dplyr::select(KO, log2FoldChange, padj) %>%
  left_join(direction_check, by = "KO")

print(final_comparison)
       KO log2FoldChange         padj          EHI         GTDB Enriched_In
1  K00754     -0.7479886 7.909738e-05 0.0031446541 0.0056385200        GTDB
2  K12063     -2.8303552 2.317878e-04 0.0000000000 0.0007237285        GTDB
3  K03496     -1.5234980 8.727123e-04 0.0004004806 0.0007883328        GTDB
4  K04763     -0.5721473 8.727123e-04 0.0072086504 0.0113583996        GTDB
5  K03205     -2.7791700 8.727123e-04 0.0000000000 0.0003911803        GTDB
6  K02316     -0.8913735 1.094241e-02 0.0008285004 0.0015585457        GTDB
7  K01185     -2.0946452 4.321013e-02 0.0000000000 0.0003613384        GTDB
8  K16695     -0.8284503 4.811695e-02 0.0008009612 0.0014506351        GTDB
9  K14059     -1.0656248 4.916919e-02 0.0003895598 0.0010485845        GTDB
10 K00986     -1.7702579 4.916919e-02 0.0000000000 0.0003677157        GTDB
target_kos <- common_KOs 



ko_info_list <- lapply(target_kos, get_kegg_info)
ko_summary <- as.data.frame(do.call(rbind, ko_info_list))

print(ko_summary)
       KO                                                Name Definition                             Pathways
1  K00754           L-malate glycosyltransferase [EC:2.4.1.-]        N/A                  No pathway assigned
2  K12063          conjugal transfer ATP-binding protein TraC        N/A                  No pathway assigned
3  K03496                     chromosome partitioning protein        N/A                  No pathway assigned
4  K04763                          integrase/recombinase XerD        N/A                  No pathway assigned
5  K03205 type IV secretion system protein VirD4 [EC:7.4.2.8]        N/A map03070: Bacterial secretion system
6  K02316                          DNA primase [EC:2.7.7.101]        N/A            map03030: DNA replication
7  K01185                              lysozyme [EC:3.2.1.17]        N/A                  No pathway assigned
8  K16695                         lipopolysaccharide exporter        N/A                  No pathway assigned
9  K14059                                           integrase        N/A                  No pathway assigned
10 K00986           RNA-directed DNA polymerase [EC:2.7.7.49]        N/A                  No pathway assigned
#Prepare the matrix (KOs as rows, MAGs as columns)
heatmap_mat <- kegg_rel_nz_meta %>%
  column_to_rownames("ID") %>%
  dplyr::select(all_of(common_KOs)) %>%
  as.matrix() %>%
  t()

#  Prepare the annotation data frame
annotation_col <- data.frame(Source = kegg_rel_nz_meta$source)
rownames(annotation_col) <- kegg_rel_nz_meta$ID

# Check if they match
all(colnames(heatmap_mat) == rownames(annotation_col))
[1] TRUE
#colors
ann_colors <- list(
  Source = source_colors
)

# Plot
pheatmap(heatmap_mat, 
         annotation_col = annotation_col, 
         annotation_colors = ann_colors,
         scale = "row",          
         show_colnames = TRUE, 
         cluster_cols = TRUE,    
         main = "Consensus KOs: Human vs Wildlife")

#Prepare the matrix (KOs as rows, MAGs as columns)
heatmap_mat <- kegg_rel_nz_meta %>%
  column_to_rownames("ID") %>%
  dplyr::select(all_of(wilcox_results$KO)) %>%
  as.matrix() %>%
  t()

#  Prepare the annotation data frame
annotation_col <- data.frame(Source = kegg_rel_nz_meta$source)
rownames(annotation_col) <- kegg_rel_nz_meta$ID

# Check if they match
all(colnames(heatmap_mat) == rownames(annotation_col))
[1] TRUE
#colors
ann_colors <- list(
  Source = source_colors
)

# Plot
pheatmap(heatmap_mat, 
         annotation_col = annotation_col, 
         annotation_colors = ann_colors,
         scale = "row",          
         show_colnames = TRUE, 
         cluster_cols = TRUE,    
         main = "Consensus KOs: Human vs Wildlife")

9.2.8 ALDEX2

library(ALDEx2)
kegg_raw_mat <- kegg_counts %>%
  column_to_rownames("mag_id") %>%
  as.matrix()

kegg_raw_mat[is.na(kegg_raw_mat)] <- 0


conds <- kegg_rel_nz_meta %>%
  filter(ID %in% rownames(kegg_raw_mat)) %>%
  arrange(match(ID, rownames(kegg_raw_mat))) %>%
  pull(source)

table(conds)  
conds
 EHI GTDB 
  25   68 
aldex_clr <- aldex.clr(
  reads = t(kegg_raw_mat),
  conds = conds,
  mc.samples = 128,   # number of Monte Carlo instances
  denom = "all",      
  verbose = TRUE
)

# Welch's t-test and Wilcoxon test
aldex_test   <- aldex.ttest(aldex_clr)
aldex_effect <- aldex.effect(aldex_clr)

# Combine results
aldex_res <- cbind(aldex_test, aldex_effect)

sig_kegg <- aldex_res %>%
  as.data.frame() %>%
  rownames_to_column("KEGG") %>%
  filter(we.eBH < 0.05) %>%  # FDR-corrected
  arrange(we.eBH)

# Add direction of enrichment
sig_kegg <- sig_kegg %>%
  mutate(enriched_in = ifelse(effect > 0, "GTDB", "EHI"))

sig_kegg
 [1] KEGG         we.ep        we.eBH       wi.ep        wi.eBH       rab.all      rab.win.EHI  rab.win.GTDB diff.btw    
[10] diff.win     effect       overlap      enriched_in 
<0 rows> (o 0- extensión row.names)

No significant results…

GENOME SIZE is significantly larger in GTDB MAGs –> correct for genome size

Which functions are enriched per unit genome by source?

genome_sizes <- genome_metadata %>%
  dplyr::select(ID, genome_size)

kegg_counts_gs <- kegg_counts %>%
  left_join(genome_sizes, by = c("mag_id" = "ID")) %>%
  column_to_rownames("mag_id")

# Divide by genome size (in Mb)
kegg_per_mb <- kegg_counts_gs[, -ncol(kegg_counts_gs)] /
  (kegg_counts_gs$genome_size / 1e6)
kegg_mat <- as.matrix(kegg_per_mb)

# Replace any NA
kegg_mat[is.na(kegg_mat)] <- 0

#Filter rare keggs 
keep_kegg <- colSums(kegg_mat > 0) >= 0.1 * nrow(kegg_mat)
kegg_mat <- kegg_mat[, keep_kegg]

#add a pseudocount
kegg_mat <- kegg_mat + 1e-6

conds <- kegg_rel_nz_meta %>%
  filter(ID %in% rownames(kegg_mat)) %>%
  arrange(match(ID, rownames(kegg_mat))) %>%
  pull(source)

table(conds)
conds
 EHI GTDB 
  25   68 
scale_factor <- 1e4

kegg_pseudo <- round(kegg_per_mb * scale_factor)

# Replace zeros (ALDEx2 hates structural zeros)
kegg_pseudo[kegg_pseudo == 0] <- 1
aldex_clr <- aldex.clr(
  reads = t(kegg_pseudo),
  conds = conds,
  mc.samples = 128,
  denom = "all",
  verbose = TRUE
)

aldex_test   <- aldex.ttest(aldex_clr)
aldex_effect <- aldex.effect(aldex_clr)

aldex_res <- cbind(aldex_test, aldex_effect)

head(aldex_res)
              we.ep      we.eBH        wi.ep       wi.eBH  rab.all rab.win.EHI rab.win.GTDB    diff.btw  diff.win
K00010 0.0001063773 0.002512091 2.530797e-05 0.0001483258 7.224545    7.494664     7.090477 -0.37785129 0.4583032
K00012 0.6501288038 1.000000000 7.406534e-01 0.9197889601 5.248405    5.183798     5.315146 -0.02973917 0.8881005
K00013 0.0003129409 0.002713804 2.310542e-05 0.0001494921 3.175763    3.474600     3.095719 -0.35789699 0.5068449
K00014 0.1795102174 1.000000000 8.466026e-03 0.0159037771 3.147306    3.409503     3.095182 -0.24028803 0.5772888
K00015 0.6819146389 0.995012247 2.074072e-04 0.0005719904 3.154343    3.442476     3.086002 -0.32476494 0.5168753
K00018 0.1811793431 1.000000000 1.098270e-02 0.0203090202 3.148386    3.385027     3.093653 -0.22941452 0.5806484
            effect   overlap
K00010 -0.76745096 0.2092442
K00012 -0.01246316 0.4875000
K00013 -0.63201501 0.2081250
K00014 -0.11844158 0.3306250
K00015 -0.48066477 0.2437500
K00018 -0.11295111 0.3287500
sig_kegg <- aldex_res %>%
  as.data.frame() %>%
  rownames_to_column("KEGG") %>%
  filter(
    we.eBH < 0.05,
    abs(effect) > 1
  ) %>%
  arrange(desc(abs(effect)))

sig_kegg
    KEGG        we.ep      we.eBH        wi.ep       wi.eBH  rab.all rab.win.EHI rab.win.GTDB   diff.btw  diff.win
1 K08138 1.487155e-05 0.002512091 1.922896e-08 4.445735e-05 3.227229    4.113634     3.109252 -0.8298336 0.8244235
     effect overlap
1 -1.022404 0.11625
library(compositions)

clr_mat <- clr(kegg_per_mb + 1e-6)

res <- apply(clr_mat, 2, function(x) {
  wilcox.test(x ~ conds)$p.value
})

res_adj <- p.adjust(res, method = "BH")
clr_res <- data.frame(
  KEGG = names(res),
  p_value = res,
  p_adj = res_adj
) %>%
  arrange(p_adj)

head(clr_res)
         KEGG      p_value        p_adj
K08138 K08138 2.581324e-08 5.968022e-05
K00010 K00010 2.051803e-05 1.062537e-04
K00013 K00013 2.394559e-05 1.062537e-04
K00024 K00024 2.216885e-05 1.062537e-04
K00033 K00033 2.899917e-05 1.062537e-04
K00052 K00052 2.394559e-05 1.062537e-04
sum(clr_res$p_adj < 0.05)
[1] 2011
res_lm <- apply(clr_mat, 2, function(x) {
  model <- lm(x ~ conds + kegg_counts_gs$genome_size)
  summary(model)$coefficients["condsGTDB","Pr(>|t|)"]
})

res_lm_adj <- p.adjust(res_lm, method = "BH")
sum(res_lm_adj < 0.05)
[1] 3
clr_res_lm <- data.frame(
  KEGG = colnames(clr_mat),
  p_value = res_lm,
  p_adj = res_lm_adj
)

sig_kegg <- clr_res_lm %>%
  filter(p_adj < 0.05) %>%
  arrange(p_adj)


sig_kegg$effect <- sapply(sig_kegg$KEGG, function(k) {
  median(clr_mat[, k][conds == "GTDB"]) -
    median(clr_mat[, k][conds == "EHI"])
})

sig_kegg <- sig_kegg %>%
  arrange(desc(abs(effect)))

sig_kegg
         KEGG      p_value        p_adj       effect
K00856 K00856 1.904299e-05 1.467579e-02 -0.436244787
K13530 K13530 1.886011e-05 1.467579e-02 -0.436244787
K13525 K13525 3.445901e-09 7.966923e-06  0.008659409

9.3 GIFTs

PCoA with Euclidean distances:

# Convert the GIFTs into a matrix of functional elements per genome (row)
gift_pcoa <- genome_gifts %>%
    to.elements(., GIFT_db) %>%
    as.data.frame() %>%
    vegdist(method="euclidean") %>%
    pcoa() #principal component analysis

#Extract eigenvalues (variance explained by first 10 axes)
gift_pcoa_rel_eigen <- gift_pcoa$values$Relative_eig[1:10]


# Get genome positions
gift_pcoa_vectors <- gift_pcoa$vectors %>% #extract vectors
  as.data.frame() %>% 
 dplyr::select(Axis.1,Axis.2) # keep the first 2 axes


gift_pcoa_eigenvalues <- gift_pcoa$values$Eigenvalues[c(1,2)]


#For the black arrows: Functional group loadings
# covariance between each functional trait and pcoa axis scores
#scale with the eigenvectors
gift_pcoa_gifts <- cov(genome_gifts, scale(gift_pcoa_vectors)) %*%
  diag((gift_pcoa_eigenvalues/(nrow(genome_gifts)-1))^(-0.5)) %>%
  as.data.frame() %>% 
  dplyr::rename(Axis.1=1,Axis.2=2) %>% 
  rownames_to_column(var="label") %>% 
  #get function summary vectors
  mutate(func=substr(label,1,3)) %>% 
  group_by(func) %>% #grouping by function
  summarise(Axis.1=mean(Axis.1),
            Axis.2=mean(Axis.2)) %>% 
  dplyr::rename(label=func) %>% 
  filter(!label %in% c("S01","S02","S03"))
set.seed(101)
scale <- 20 # scale for vector loadings (to make arrows visible)
gift_pcoa_vectors %>% 
  rownames_to_column(var="ID") %>% 
  left_join(genome_metadata, by="ID") %>%
  ggplot() +
      #genome positions
      scale_color_manual(values=source_colors)+
      geom_point(aes(x=Axis.1,y=Axis.2, color = source), 
                 alpha=0.9, shape=16) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-0.8,1) + 
    ylim(-1,1.5) +
    theme_minimal() + 
  
  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
  )
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps

gift_pcoa_vectors %>% 
  rownames_to_column(var="ID") %>% 
  left_join(genome_metadata, by="ID") %>%
  ggplot() +
      #genome positions
      geom_point(aes(x=Axis.1,y=Axis.2, color = completeness), 
                 alpha=0.9, shape=16) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-0.8,1) + 
    ylim(-1,1.5) +
    theme_minimal() + 
  
  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
  )
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps

gift_pcoa_vectors %>% 
  rownames_to_column(var="ID") %>% 
  left_join(genome_metadata, by="ID") %>%
  ggplot() +
      #genome positions
      geom_point(aes(x=Axis.1,y=Axis.2, color = continent), 
                 alpha=0.9, shape=16) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-0.8,1) + 
    ylim(-1,1.5) +
    theme_minimal() + 
  
  labs(
    x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
    y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
  )
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps

Using k-means to cluster the groups and check which MAGs cluster together:

coords <- gift_pcoa_vectors %>%
  rownames_to_column("MAG") %>%
  as_tibble()

set.seed(123)
km <- kmeans(coords[, c("Axis.1", "Axis.2")], centers = 4, nstart = 25, iter.max = 100)

coords <- coords %>% mutate(cluster = factor(km$cluster))
# centroids as a tibble for plotting
centroids <- as_tibble(km$centers) %>% mutate(cluster = factor(1:nrow(km$centers)))

ggplot(coords, aes(x = Axis.1, y = Axis.2, color = cluster)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_point(data = centroids, aes(x = Axis.1, y = Axis.2, color = cluster),
             shape = 4, size = 5, stroke = 1.25) +   # X marks centroids
  theme_minimal() +
  labs(title = paste0("PCoA colored by kmeans (k=4)"),
       color = "cluster")

mags_by_cluster <- split(coords$MAG, km$cluster)
mags_by_cluster
$`1`
 [1] "GCA_958435695.1" "EHM070114"       "EHM049446"       "GCF_001405935.1" "GCA_030844705.1" "EHM016582"      
 [7] "GCA_030839725.1" "EHM075512"       "EHM048917"       "GCF_001406015.1" "GCA_958447325.1" "GCA_958416015.1"
[13] "GCA_030839525.1" "EHM039583"      

$`2`
 [1] "GCA_022784765.1" "EHM049769"       "GCF_003459965.1" "EHM069263"       "EHM016228"       "GCA_958416885.1"
 [7] "GCA_008680675.1" "GCA_958404515.1" "GCA_030843875.1" "EHM020917"       "GCA_022776965.1" "GCA_959028115.1"
[13] "GCA_009774835.1" "GCA_958413795.1" "GCA_959023275.1" "GCF_003474765.1" "GCF_029369685.1" "GCA_008679655.1"

$`3`
 [1] "GCA_020809955.1" "GCA_014870645.1" "GCA_030842705.1" "GCA_959606485.1" "EHM027637"       "GCF_012843465.1"
 [7] "GCF_003469715.1" "GCF_003471825.1" "EHM046460"       "GCF_003469775.1" "GCA_958422645.1" "GCA_022774455.1"
[13] "GCA_030842445.1" "GCA_958440685.1" "EHM016991"       "GCF_018784195.1" "EHM067538"       "GCF_001405775.1"
[19] "GCA_030844325.1" "GCA_025757725.1" "GCA_959600655.1" "GCA_958432605.1" "GCF_003437495.1" "GCF_003472045.1"
[25] "EHM064868"       "GCF_020687325.1" "GCF_003468915.1" "GCF_003467575.1" "GCF_018292145.1" "GCF_003469235.1"
[31] "GCA_008679285.1" "GCA_958427815.1" "EHM047216"       "GCA_958445425.1" "EHM023585"       "GCF_002206325.1"
[37] "GCF_003468605.1" "GCA_958371025.1" "GCF_018288975.1" "EHM031743"       "GCA_030841945.1" "GCA_959026965.1"
[43] "GCA_022771305.1" "GCA_003464145.1"

$`4`
 [1] "EHM072417"       "EHM029564"       "GCF_019733675.1" "EHM049533"       "GCF_018784125.1" "GCF_022835355.1"
 [7] "GCF_003475725.1" "EHM028668"       "GCF_018587995.1" "GCA_959026225.1" "GCF_022835335.1" "EHM048790"      
[13] "EHM023781"       "GCF_003464475.1" "GCF_018289335.1" "EHM016318"       "GCF_003439415.1"

9.3.1 Checking each cluster at once

cluster1_MAGs <- mags_by_cluster[[1]]
genome_metadata %>% filter(ID %in% cluster1_MAGs)
# A tibble: 14 × 25
   ID              species completeness contamination genome_size    GC    N50 contigs host_species host_order host_class
   <chr>           <chr>          <dbl>         <dbl>       <dbl> <dbl>  <dbl>   <dbl> <chr>        <chr>      <chr>     
 1 EHM016582       s__Par…         92.2          1.17     3706610  45.7   7507     618 Strix aluco  Strigifor… Aves      
 2 EHM039583       s__Par…         98.5          1.78     4472794  45    29681     243 Sciurus vul… Rodentia   Mammalia  
 3 EHM048917       s__Par…         94.6          1.35     3837800  46    12164     428 Lepus europ… Lagomorpha Mammalia  
 4 EHM049446       s__Par…         90.1          0.6      3591722  46    12732     405 Lepus europ… Lagomorpha Mammalia  
 5 EHM070114       s__Par…        100.0          2.39     4728690  45    63727     110 Rattus ratt… Rodentia   Mammalia  
 6 EHM075512       s__Par…        100.0          2.34     4690486  45    64909     112 Rattus ratt… Rodentia   Mammalia  
 7 GCA_958416015.1 Paraba…         92.4          0.18     3902302  46.4   9412     497 <NA>         <NA>       <NA>      
 8 GCA_958447325.1 Paraba…         99.4          0.61     4709435  45.0 118754      66 <NA>         <NA>       <NA>      
 9 GCA_030844705.1 Paraba…         97.7          0.85     4586771  45.3 134207      49 <NA>         <NA>       <NA>      
10 GCA_030839525.1 Paraba…         97.9          1.68     4512812  45.1  15103     417 <NA>         <NA>       <NA>      
11 GCA_030839725.1 Paraba…         92.7          1.74     4293293  44.8  29168     238 <NA>         <NA>       <NA>      
12 GCA_958435695.1 Paraba…         99.4          1.06     4635291  45.0 106588      71 <NA>         <NA>       <NA>      
13 GCF_001405935.1 Paraba…         99.4          1.5      5099398  45.0 277147      45 <NA>         <NA>       <NA>      
14 GCF_001406015.1 Paraba…         99.4          0.62     5073478  NA       NA      NA <NA>         <NA>       <NA>      
# ℹ 14 more variables: assembly_type <chr>, isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>,
#   country <chr>, extra_locality <chr>, source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>,
#   mimag_medium_quality <lgl>, gtdb_representative <lgl>, ncbi_date <date>, continent <chr>
cluster2_MAGs <- mags_by_cluster[[2]]
genome_metadata %>% filter(ID %in% cluster2_MAGs)
# A tibble: 18 × 25
   ID              species completeness contamination genome_size    GC    N50 contigs host_species host_order host_class
   <chr>           <chr>          <dbl>         <dbl>       <dbl> <dbl>  <dbl>   <dbl> <chr>        <chr>      <chr>     
 1 EHM016228       s__Par…         97.3          0.92     4815605  45   3.27e4     224 Strix aluco  Strigifor… Aves      
 2 EHM020917       s__Par…         90.3          1.73     3472909  45.6 1.19e4     414 Strix aluco  Strigifor… Aves      
 3 EHM049769       s__Par…         95.2          1.01     4794833  45   6.88e4     107 Podarcis ga… Squamata   Reptilia  
 4 EHM069263       s__Par…        100            0.74     5112462  45   1.46e5      97 Rattus ratt… Rodentia   Mammalia  
 5 GCA_008680675.1 Paraba…         99.4          0.69     4962458  45.2 1.06e6       6 <NA>         <NA>       <NA>      
 6 GCA_022776965.1 Paraba…         97.3          0.26     4589485  45.3 1.64e5      44 <NA>         <NA>       <NA>      
 7 GCA_958413795.1 Paraba…         99.4          0.77     4948642  45.1 1.93e5      48 <NA>         <NA>       <NA>      
 8 GCA_959028115.1 Paraba…         99.4          0.51     4709637  45.1 1.35e5      54 <NA>         <NA>       <NA>      
 9 GCA_958416885.1 Paraba…         99.4          1.32     4704055  45.0 1.19e5      58 <NA>         <NA>       <NA>      
10 GCA_958404515.1 Paraba…         99.4          0.93     5100500  44.9 1.25e5      61 <NA>         <NA>       <NA>      
11 GCA_030843875.1 Paraba…         96.0          2.28     4410710  45.2 1.27e4     475 <NA>         <NA>       <NA>      
12 GCA_009774835.1 Paraba…         99.2          0.89     4866783  45.1 9.64e4      71 <NA>         <NA>       <NA>      
13 GCA_022784765.1 Paraba…         99.4          1.14     4816138  45.1 1.29e5      55 <NA>         <NA>       <NA>      
14 GCA_959023275.1 Paraba…         99.4          0.73     4832844  45.0 1.39e5      54 <NA>         <NA>       <NA>      
15 GCA_008679655.1 Paraba…         97.8          0.52     5036842  45.0 7.93e5      11 <NA>         <NA>       <NA>      
16 GCF_003474765.1 Paraba…         99.4          1.07     5043355  45.1 1.75e5      83 <NA>         <NA>       <NA>      
17 GCF_003459965.1 Paraba…         99.4          0.83     5151392  44.9 2.28e5      59 <NA>         <NA>       <NA>      
18 GCF_029369685.1 Paraba…         99.4          1.46     5493583  45.1 5.39e6       2 <NA>         <NA>       <NA>      
# ℹ 14 more variables: assembly_type <chr>, isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>,
#   country <chr>, extra_locality <chr>, source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>,
#   mimag_medium_quality <lgl>, gtdb_representative <lgl>, ncbi_date <date>, continent <chr>
cluster3_MAGs <- mags_by_cluster[[3]]
genome_metadata %>% filter(ID %in% cluster3_MAGs)
# A tibble: 44 × 25
   ID              species completeness contamination genome_size    GC    N50 contigs host_species host_order host_class
   <chr>           <chr>          <dbl>         <dbl>       <dbl> <dbl>  <dbl>   <dbl> <chr>        <chr>      <chr>     
 1 EHM016991       s__Par…         99.0          0.38     4748219  44.9 1.17e5      76 Podarcis pi… Squamata   Reptilia  
 2 EHM023585       s__Par…         92.8          2.18     3889519  45.6 8.65e3     607 Sciurus vul… Rodentia   Mammalia  
 3 EHM027637       s__Par…         93.5          2.05     4160022  45.3 1.28e4     526 Cathartes a… Accipitri… Aves      
 4 EHM031743       s__Par…         93.5          1.99     4260825  45   2.13e4     296 Canis famil… Carnivora  Mammalia  
 5 EHM046460       s__Par…         99.5          1.66     4335884  45   3.87e4     181 Cathartes a… Accipitri… Aves      
 6 EHM047216       s__Par…         99.9          1.04     4517903  45   6.39e4     124 Lepus europ… Lagomorpha Mammalia  
 7 EHM064868       s__Par…         93.9          0.57     4141119  45   1.32e4     435 Lepus europ… Lagomorpha Mammalia  
 8 EHM067538       s__Par…         93.1          1.71     4147480  46   1.27e4     485 Lepus europ… Lagomorpha Mammalia  
 9 GCA_025757725.1 Paraba…         99.4          0.99     4848511  45.1 4.85e6       1 <NA>         <NA>       <NA>      
10 GCA_008679285.1 Paraba…         95.8          1.54     4711604  45.1 3.08e5      21 <NA>         <NA>       <NA>      
# ℹ 34 more rows
# ℹ 14 more variables: assembly_type <chr>, isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>,
#   country <chr>, extra_locality <chr>, source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>,
#   mimag_medium_quality <lgl>, gtdb_representative <lgl>, ncbi_date <date>, continent <chr>
cluster4_MAGs <- mags_by_cluster[[4]]
genome_metadata %>% filter(ID %in% cluster4_MAGs)
# A tibble: 17 × 25
   ID              species completeness contamination genome_size    GC    N50 contigs host_species host_order host_class
   <chr>           <chr>          <dbl>         <dbl>       <dbl> <dbl>  <dbl>   <dbl> <chr>        <chr>      <chr>     
 1 EHM016318       s__Par…         93.6          1.81     4085923  45.3 7.43e3     685 Strix aluco  Strigifor… Aves      
 2 EHM023781       s__Par…         93.5          1.92     4159834  45.6 1.68e4     369 Podarcis ga… Squamata   Reptilia  
 3 EHM028668       s__Par…        100.0          0.87     4684324  45   7.38e4     107 Timon lepid… Squamata   Reptilia  
 4 EHM029564       s__Par…         95.9          1.5      4476906  45   4.19e4     152 Timon lepid… Squamata   Reptilia  
 5 EHM048790       s__Par…        100.0          2.23     4479865  45   5.73e4     130 Lepus europ… Lagomorpha Mammalia  
 6 EHM049533       s__Par…        100.0          0.65     5203783  45   1.24e5      64 Podarcis fi… Squamata   Reptilia  
 7 EHM072417       s__Par…         93.8          0.7      4365524  45   4.85e4     145 Timon lepid… Squamata   Reptilia  
 8 GCA_959026225.1 Paraba…         99.4          0.54     4672770  45.1 9.15e4      77 <NA>         <NA>       <NA>      
 9 GCF_018587995.1 Paraba…         99.4          0.27     5066235  45.0 3.04e5      80 <NA>         <NA>       <NA>      
10 GCF_018784125.1 Paraba…         99.4          1.84     4988964  45.1 1.72e5     112 <NA>         <NA>       <NA>      
11 GCF_018289335.1 Paraba…         99.4          0.71     5311305  45.2 5.20e6       3 <NA>         <NA>       <NA>      
12 GCF_022835335.1 Paraba…         93.6          1.43     4806077  45.0 2.76e6      26 <NA>         <NA>       <NA>      
13 GCF_022835355.1 Paraba…         93.6          1.46     4803375  45.0 2.76e6      29 <NA>         <NA>       <NA>      
14 GCF_003439415.1 Paraba…         99.4          1.02     5294086  45.1 2.14e5      81 <NA>         <NA>       <NA>      
15 GCF_019733675.1 Paraba…         99.4          1.49     5285880  45.2 3.21e5      33 <NA>         <NA>       <NA>      
16 GCF_003475725.1 Paraba…         99.4          1.84     5176159  45.0 1.12e5     111 <NA>         <NA>       <NA>      
17 GCF_003464475.1 Paraba…         99.4          1.9      5187286  44.9 2.09e5      72 <NA>         <NA>       <NA>      
# ℹ 14 more variables: assembly_type <chr>, isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>,
#   country <chr>, extra_locality <chr>, source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>,
#   mimag_medium_quality <lgl>, gtdb_representative <lgl>, ncbi_date <date>, continent <chr>
metadata_with_cluster <- genome_metadata %>%
  left_join(coords %>% dplyr::select(MAG, cluster), by = c("ID" = "MAG"))

metadata_with_cluster %>%
  group_by(cluster, source) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(cluster, desc(n))
# A tibble: 8 × 3
  cluster source     n
  <fct>   <chr>  <int>
1 1       GTDB       8
2 1       EHI        6
3 2       GTDB      14
4 2       EHI        4
5 3       GTDB      36
6 3       EHI        8
7 4       GTDB      10
8 4       EHI        7
# Group summaries
metadata_with_cluster %>%
  group_by(cluster) %>%
  summarise(mean_genome_size = mean(genome_size, na.rm = TRUE),
            median_contamination = median(contamination, na.rm = TRUE),
            .groups = "drop")
# A tibble: 4 × 3
  cluster mean_genome_size median_contamination
  <fct>              <dbl>                <dbl>
1 1               4417206.                1.26 
2 2               4825680.                0.905
3 3               4708235.                1.04 
4 4               4826370.                1.46 
save(ehi_mags,
     phylum_colors,
     genome_annotations,
     genome_gifts,
     contig_to_genome,
     gtdb_metadata,
     ehi_metadata,
     master_index,
     genome_metadata,
     source_colors,
     getphylo_tree,
     metadata_with_cluster,
     file = "data/data.Rdata")

Is the genome size different between clusters?

kruskal.test(genome_size ~ cluster, data = metadata_with_cluster)

    Kruskal-Wallis rank sum test

data:  genome_size by cluster
Kruskal-Wallis chi-squared = 8.0272, df = 3, p-value = 0.04545
pairwise.wilcox.test(
  x = metadata_with_cluster$genome_size,
  g = metadata_with_cluster$cluster,
  p.adjust.method = "BH"   # FDR correction
)

    Pairwise comparisons using Wilcoxon rank sum exact test 

data:  metadata_with_cluster$genome_size and metadata_with_cluster$cluster 

  1     2     3    
2 0.032 -     -    
3 0.165 0.288 -    
4 0.078 1.000 0.382

P value adjustment method: BH 

Plots

ggplot(metadata_with_cluster, aes(x = genome_size, y = GC, col = cluster))+
  geom_point()+
  theme_bw()
Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_point()`).

ggplot(metadata_with_cluster, aes(x = assembly_type, y = source, col = cluster))+
  geom_point()+
  theme_bw()

ggplot(metadata_with_cluster, aes(x = cluster, y = genome_size, col = cluster))+
  geom_point()+
  theme_bw()

#Aggregate bundle-level GIFTs into the compound level
GIFTs_elements <- to.elements(genome_gifts,GIFT_db)

#Aggregate element-level GIFTs into the function level
GIFTs_functions <- to.functions(GIFTs_elements,GIFT_db)

#Aggregate function-level GIFTs into overall Biosynthesis, Degradation and Structural GIFTs
GIFTs_domains <- to.domains(GIFTs_functions,GIFT_db)
GIFTs_elements %>%
    as.data.frame() %>%
    rownames_to_column(var="MAG")%>%
    pivot_longer(!MAG,names_to="trait",values_to="gift") %>%
    left_join(metadata_with_cluster, by = join_by(MAG == ID)) %>%
    mutate(functionid = substr(trait, 1, 3)) %>%
    mutate(trait = case_when(
      trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
      TRUE ~ trait
    )) %>%
    mutate(functionid = case_when(
      functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
      TRUE ~ functionid
    )) %>%
    mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
    mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
    ggplot(aes(x=MAG,y=trait,fill=gift)) +
        geom_tile(colour="white", linewidth=0.2)+
        scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
        facet_grid(functionid ~ cluster, scales="free",space="free") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
              axis.text.y = element_text(size=6),
              strip.text.y = element_text(angle = 0)
              ) +
        labs(y="Traits",x="Samples",fill="GIFT")

GIFTs_elements %>%
    as.data.frame() %>%
    rownames_to_column(var="MAG")%>%
    pivot_longer(!MAG,names_to="trait",values_to="gift") %>%
    left_join(metadata_with_cluster, by = join_by(MAG == ID)) %>%
    mutate(functionid = substr(trait, 1, 3)) %>%
    mutate(trait = case_when(
      trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
      TRUE ~ trait
    )) %>%
    mutate(functionid = case_when(
      functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
      TRUE ~ functionid
    )) %>%
    mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
    mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
    ggplot(aes(x=MAG,y=trait,fill=gift)) +
        geom_tile(colour="white", linewidth=0.2)+
        scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
        facet_grid(functionid ~ source, scales="free",space="free") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
              axis.text.y = element_text(size=6),
              strip.text.y = element_text(angle = 0)
              ) +
        labs(y="Traits",x="Samples",fill="GIFT")

library(RColorBrewer)
library(reshape2)
Warning: package 'reshape2' was built under R version 4.4.3

Adjuntando el paquete: 'reshape2'
The following object is masked from 'package:tidyr':

    smiths
GIFTs_elements %>%
  reshape2::melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db,by="Code_element") %>%
  ggplot(., aes(x=Code_element, y=Genome, fill=GIFT, group=Code_function))+
    geom_tile()+
    scale_y_discrete(guide = guide_axis(check.overlap = TRUE))+
    scale_x_discrete(guide = guide_axis(check.overlap = TRUE))+
    scale_fill_gradientn(limits = c(0,1), colours=brewer.pal(7, "YlGnBu"))+
    facet_grid(. ~ Code_function, scales = "free", space = "free")+
    theme_grey(base_size=8)+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),strip.text.x = element_text(angle = 90))
Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.

head(GIFT_db)
  Code_bundle Code_element Code_function       Domain                  Function             Element
1     B010101        B0101           B01 Biosynthesis Nucleic acid biosynthesis Inosinic acid (IMP)
2     B010201        B0102           B01 Biosynthesis Nucleic acid biosynthesis Uridylic acid (UMP)
3     B010301        B0103           B01 Biosynthesis Nucleic acid biosynthesis             UDP/UTP
4     B010401        B0104           B01 Biosynthesis Nucleic acid biosynthesis             CDP/CTP
5     B010501        B0105           B01 Biosynthesis Nucleic acid biosynthesis             ADP/ATP
6     B010601        B0106           B01 Biosynthesis Nucleic acid biosynthesis             GDP/GTP
                                                                                                                                                                                                                                          Definition
1 K00764 (K01945,K11787,K11788,K13713) (K00601,K11175,K08289,K11787,K01492) (K01952,(K23269+K23264+K23265),(K23270+K23265)) (K01933,K11787,(K11788 (K01587,K11808,(K01589 K01588)))) (K01923,K01587,K13713) K01756 (K00602,(K01492,(K06863 K11176)))
2                                                                                                              (K11540,((K11541 K01465),((K01954,(K01955+K01956)) ((K00609+K00610),K00608) K01465))) (K00226,K00254,K17828) (K13421,(K00762 K01591))
3                                                                                                                                                                                                                             (K13800,K13809,K09903)
4                                                                                                                                                                                                                             (K00940,K18533) K01937
5                                                                                                                                                                                                 K01939 K01756 (K00939,K18532,K18533,K00944) K00940
6                                                                                                                                                                                                               K00088 K01951 K00942 (K00940,K18533)
GIFT_db%>%
  filter(Code_function == "D09")
   Code_bundle Code_element Code_function      Domain               Function         Element
1      D090101        D0901           D09 Degradation Antibiotic degradation      Penicillin
2      D090201        D0902           D09 Degradation Antibiotic degradation      Carbapenem
3      D090301        D0903           D09 Degradation Antibiotic degradation   Cephalosporin
4      D090401        D0904           D09 Degradation Antibiotic degradation       Oxacillin
5      D090501        D0905           D09 Degradation Antibiotic degradation   Streptogramin
6      D090601        D0906           D09 Degradation Antibiotic degradation      Fosfomycin
7      D090701        D0907           D09 Degradation Antibiotic degradation    Tetracycline
8      D090801        D0908           D09 Degradation Antibiotic degradation       Macrolide
9      D091001        D0910           D09 Degradation Antibiotic degradation Chloramphenicol
10     D091101        D0911           D09 Degradation Antibiotic degradation     Lincosamide
11     D091201        D0912           D09 Degradation Antibiotic degradation  Streptothricin
                                                                                                                                                                                                                 Definition
1                                                                                                           K18698,K18699,K18796,K18767,K18797,K19097,K19317,K18768,K18970,K19316,K22346,K18795,K19218,K19217,K17836,K18766
2                                                                                                                                                                                 K17837,K18782,K18781,K18780,K19099,K19216
3                                                                                                                                                            K19095,K19096,K19100,K19101,K19214,K19215,K20319,K20320,K01467
4  K17838,K18790,K18791,K19098,K18792,K19213,K21276,K18793,K18971,K22352,K19209,K18976,K18973,K18794,K18972,K21277,K19210,K19211,K19212,K22335,K19319,K22331,K22351,K19320,K19318,K19321,K19322,K21266,K22334,K22333,K22332
5                                                                                                                                                                                                             K19349,K19350
6                                                                                                                                                                                                                    K21252
7                                                                                                                                                                                 K08151,K08168,K18214,K18218,K18220,K18221
8                                                                                                                                                                                        K06979,K08217,K18230,K18231,K21251
9                                                                                                                                                                                 K00638,K08160,K18552,K18553,K18554,K19271
10                                                                                                                                                                                              K18236,K19349,K19350,K19545
11                                                                                                                                                                                                            K19273,K20816
library(reshape2)
library(RColorBrewer)

GIFTs_elements %>%
  melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db, by = "Code_element") %>%            # Map Code_element -> Element & Function
  filter(str_starts(Code_element, "D09")) %>%            # Only antibiotic degradation
  left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
  mutate(cluster = factor(cluster)) %>%
  droplevels() %>%
  arrange(cluster, Genome) %>%
  # Map Code_element IDs to trait names
  mutate(trait = case_when(
    Code_element %in% GIFT_db$Code_element ~ GIFT_db$Element[match(Code_element, GIFT_db$Code_element)],
    TRUE ~ Code_element
  )) %>%
  # Map function IDs for faceting if desired
  mutate(functionid = case_when(
    substr(Code_element,1,3) %in% GIFT_db$Code_function ~ GIFT_db$Function[match(substr(Code_element,1,3), GIFT_db$Code_function)],
    TRUE ~ substr(Code_element,1,3)
  )) %>%
  mutate(trait = factor(trait, levels = unique(GIFT_db$Element))) %>%
  ggplot(aes(x = trait, y = Genome, fill = GIFT)) +
    geom_tile(colour = "white", linewidth = 0.2) +
    scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_fill_gradientn(limits = c(0,1), colours = brewer.pal(7, "YlGnBu")) +
    facet_wrap(~ cluster, scales = "free_y", ncol = 1) +  
    theme_grey(base_size = 8) +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      strip.text.x = element_text(angle = 0)
    ) +
    labs(x = "Trait", fill = "GIFT")
Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.

GIFTs_elements %>%
  melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db, by = "Code_element") %>%
  filter(str_starts(Code_element, "D09")) %>%
  left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
  mutate(cluster = factor(cluster)) %>%
  droplevels() %>%
  arrange(cluster, Genome) %>%
  mutate(trait = Element) %>%
  # Ensure trait is a factor to maintain order
  mutate(trait = factor(trait, levels = rev(unique(GIFT_db$Element)))) %>%
  ggplot( aes(x = Genome, y = trait)) +
  # Heatmap
  geom_tile(aes(fill = GIFT), color = "white") +
  scale_fill_gradientn(colours = brewer.pal(7, "YlGnBu"), name = "GIFT Score") +
  
  # Start a new fill scale for the Source bar
  new_scale_fill() +
  
  # Add the source bar at the very top or bottom
  geom_tile(aes(y = -0.5, fill = source), height = 0.5) +
  scale_fill_manual(values = source_colors) +
  
  facet_grid(~ cluster, scales = "free_x", space = "free_x") +
  theme_minimal(base_size = 8) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.

9.3.2 Checking difference in GIFTs

9.3.2.1 Cluster-wise

Element GIFTs different between clusters of pcoa

# Get only the GIFT columns
gift_cols <- colnames(GIFTs_elements)[!(colnames(GIFTs_elements) %in% c("MAG_ID","cluster","source"))]

gift_dataframe <- GIFTs_elements %>%
  as.data.frame() %>%            
  rownames_to_column(var = "ID") 

gift_df_meta <- gift_dataframe %>%
  left_join(metadata_with_cluster, by = "ID")


# Kruskal-Wallis for 4 groups
kruskal_results <- sapply(gift_cols, function(g) {
  kruskal.test(as.formula(paste(g, "~ cluster")), data = gift_df_meta)$p.value
})

# Adjust for multiple testing
kruskal_results_adj <- p.adjust(kruskal_results, method = "BH")

# Combine into a table
kruskal_table <- data.frame(
  GIFT = gift_cols,
  p_value = kruskal_results,
  p_adj = kruskal_results_adj
)

kruskal_table %>% filter(p_adj<0.05)
       GIFT      p_value        p_adj
B0216 B0216 8.964986e-05 1.255098e-03
B0705 B0705 2.448055e-05 4.569703e-04
D0901 D0901 8.145800e-20 2.280824e-18
D0908 D0908 8.145800e-20 2.280824e-18
pairwise_results <- lapply(gift_cols, function(g) {
  pairwise.wilcox.test(
    x = gift_df_meta[[g]],
    g = gift_df_meta$cluster,
    p.adjust.method = "BH"
  )
})

names(pairwise_results) <- gift_cols

pairwise_sig_table <- lapply(names(pairwise_results), function(g) {
  
  # Extract p-value matrix
  pmat <- pairwise_results[[g]]$p.value
  
  # Convert to long format
  pmat_long <- as.data.frame(as.table(pmat)) %>%
    filter(!is.na(Freq)) %>%       # remove NA (diagonal / upper triangle)
    dplyr::rename(group1 = Var1, group2 = Var2, p_adj = Freq) %>%
    filter(p_adj < 0.05) %>%      
    mutate(GIFT = g)
  
  return(pmat_long)
})



# Combine all GIFTs into one table
pairwise_sig_table <- bind_rows(pairwise_sig_table) %>%
  dplyr::select(GIFT, group1, group2, p_adj)

pairwise_sig_table 
    GIFT group1 group2        p_adj
1  B0105      3      1 3.722745e-02
2  B0216      4      1 1.895970e-02
3  B0216      3      2 1.740565e-03
4  B0216      4      2 3.161198e-04
5  B0705      3      1 4.396047e-03
6  B0705      3      2 2.247836e-05
7  B0705      4      2 9.897753e-03
8  B0706      3      1 3.722745e-02
9  D0901      2      1 3.904407e-08
10 D0901      4      1 4.919048e-08
11 D0901      3      2 2.058151e-14
12 D0901      4      3 2.058151e-14
13 D0908      3      1 9.575590e-14
14 D0908      4      1 4.919048e-08
15 D0908      3      2 2.468680e-14
16 D0908      4      2 8.235871e-09
GIFTs_elements %>%
  melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db, by = "Code_element") %>%
  filter(Code_element %in% pairwise_sig_table$GIFT) %>%
  left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
  distinct(Genome, Code_element, .keep_all = TRUE) %>%
  mutate(cluster = factor(cluster)) %>%
  droplevels() %>%
  arrange(cluster, Genome) %>%
  mutate(trait = Element) %>%
  ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
    geom_tile(colour = "white", linewidth = 0.2) +
    scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
  new_scale_fill() +
  
  # Add the source bar at the very top or bottom
  geom_tile(aes(y = -0.5, fill = source), height = 0.3) +
  scale_fill_manual(values = source_colors) +
    facet_grid(~ cluster, scales = "free_x", space = "free_x") + 
    theme(
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 4),
      strip.text.x = element_text(face = "bold")
    ) +
    labs(y = "Trait", x = "Genome", fill = "GIFT")
Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.

GIFT_db%>%
  filter(Code_element %in%  pairwise_sig_table$GIFT)
   Code_bundle Code_element Code_function       Domain                  Function          Element
1      B010501        B0105           B01 Biosynthesis Nucleic acid biosynthesis          ADP/ATP
2      B021601        B0216           B02 Biosynthesis   Amino acid biosynthesis       Tryptophan
3      B070501        B0705           B07 Biosynthesis      Vitamin biosynthesis Pyridoxal-P (B6)
4      B070502        B0705           B07 Biosynthesis      Vitamin biosynthesis Pyridoxal-P (B6)
5      B070601        B0706           B07 Biosynthesis      Vitamin biosynthesis      Biotin (B7)
6      B070602        B0706           B07 Biosynthesis      Vitamin biosynthesis      Biotin (B7)
7      B070603        B0706           B07 Biosynthesis      Vitamin biosynthesis      Biotin (B7)
8      B070604        B0706           B07 Biosynthesis      Vitamin biosynthesis      Biotin (B7)
9      D090101        D0901           D09  Degradation    Antibiotic degradation       Penicillin
10     D090801        D0908           D09  Degradation    Antibiotic degradation        Macrolide
                                                                                                                                     Definition
1                                                                                            K01939 K01756 (K00939,K18532,K18533,K00944) K00940
2  ((((K01657+K01658),K13503,K13501,K01656) K00766),K13497) (((K01817,K24017) (K01656,K01609)),K13498,K13501) ((K01695+(K01696,K06001)),K01694)
3                                                                                                     K03472 K03473 K00831 K00097 K03474 K00275
4                                                                                                                                 K06215 K08681
5                                                                                               K00652 (((K00833,K19563) K01935),K19562) K01012
6                                                                                                                   K00652 K25570 K01935 K01012
7                                                                                                            K16593 K00652 K19563 K01935 K01012
8                                                                                                   K01906 K00652 (K00833,K19563) K01935 K01012
9                               K18698,K18699,K18796,K18767,K18797,K19097,K19317,K18768,K18970,K19316,K22346,K18795,K19218,K19217,K17836,K18766
10                                                                                                           K06979,K08217,K18230,K18231,K21251
library(rstatix)
library(dplyr)
library(purrr)

pairwise_sig_table <- map_df(gift_cols, function(g) {
  
  # Check if there is variation in the GIFT values
  # If all values are the same, skip this GIFT
  if(length(unique(gift_df_meta[[g]])) < 2) return(NULL)
  
  # Run the test safely
  tryCatch({
    results <- gift_df_meta %>%
      wilcox_test(as.formula(paste0("`", g, "` ~ cluster")), p.adjust.method = "BH")
    
    eff <- gift_df_meta %>%
      wilcox_effsize(as.formula(paste0("`", g, "` ~ cluster")))
    
    results %>%
      left_join(eff, by = c("group1", "group2", ".y." = ".y.", "n1", "n2")) %>%
      filter(p.adj < 0.05) %>%
      mutate(GIFT = g)
  }, error = function(e) return(NULL)) # If it still fails, just skip it
})

top_gifts <- pairwise_sig_table %>%
  arrange(desc(effsize))


top_gifts
# A tibble: 6 × 12
  .y.   group1 group2    n1    n2 statistic          p     p.adj p.adj.signif effsize magnitude GIFT 
  <chr> <chr>  <chr>  <int> <int>     <dbl>      <dbl>     <dbl> <chr>          <dbl> <ord>     <chr>
1 B0216 2      4         18    17      258. 0.0000527  0.000316  ***            0.687 large     B0216
2 B0705 2      3         18    44      151  0.00000375 0.0000225 ****           0.589 large     B0705
3 B0705 2      4         18    17       79  0.005      0.01      **             0.478 moderate  B0705
4 B0216 1      4         14    17      179  0.009      0.019     *              0.470 moderate  B0216
5 B0216 2      3         18    44      583  0.00058    0.002     **             0.438 moderate  B0216
6 B0705 1      3         14    44      174  0.001      0.004     **             0.419 moderate  B0705
library(reshape2)
library(RColorBrewer)

GIFTs_elements %>%
  melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db, by = "Code_element") %>%
  # Only keep significant traits
  filter(Code_element %in% pairwise_sig_table$GIFT) %>%
  left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
  distinct(Genome, Code_element, .keep_all = TRUE) %>%
  mutate(cluster = factor(cluster)) %>%
  droplevels() %>%
  arrange(cluster, Genome) %>%
  mutate(trait = Element) %>%          # human-readable trait
  ggplot(aes(x = trait, y = Genome, fill = GIFT)) +
    geom_tile(colour = "white", linewidth = 0.2) +
    scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
    facet_wrap(~ cluster, scales = "free_y", ncol = 1) +
    theme_grey(base_size = 8) +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      strip.text.x = element_text(angle = 0)
    ) +
    labs(x = "Trait", fill = "GIFT")

GIFTs_elements %>%
  melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db, by = "Code_element") %>%
  filter(Code_element %in% pairwise_sig_table$GIFT) %>%
  left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
  distinct(Genome, Code_element, .keep_all = TRUE) %>%
  mutate(cluster = factor(cluster)) %>%
  droplevels() %>%
  arrange(cluster, Genome) %>%
  mutate(trait = Element) %>%
  ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
    geom_tile(colour = "white", linewidth = 0.2) +
    scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
    facet_grid(~ cluster, scales = "free_x", space = "free_x") + 
    theme_grey(base_size = 8) +
    theme(
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      strip.text.x = element_text(face = "bold")
    ) +
    labs(y = "Trait", x = "Genome", fill = "GIFT")
Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.

9.3.2.2 Source-wise

GIFTs that are different between EHI and GTDB

wilcox_results <- sapply(gift_cols, function(g) {
  wilcox.test(as.formula(paste(g, "~ source")), data = gift_df_meta)$p.value
})

wilcox_results_adj <- p.adjust(wilcox_results, method = "BH")

data.frame(
  GIFT = gift_cols,
  p_value = wilcox_results,
  p_adj = wilcox_results_adj
)
       GIFT      p_value      p_adj
B0101 B0101 0.0056603987 0.07924558
B0102 B0102 0.5605090914 0.58126869
B0103 B0103          NaN        NaN
B0104 B0104 0.5605090914 0.58126869
B0105 B0105 0.4795942525 0.56798830
B0106 B0106 0.2222308172 0.41483086
B0204 B0204 0.5605090914 0.58126869
B0205 B0205 0.3981400285 0.55739604
B0206 B0206 0.1041649692 0.22435532
B0207 B0207 0.2946362754 0.47167436
B0208 B0208          NaN        NaN
B0209 B0209 0.4689178506 0.56798830
B0210 B0210 0.1041649692 0.22435532
B0211 B0211 0.4689178506 0.56798830
B0212 B0212 0.0199165012 0.11157331
B0213 B0213          NaN        NaN
B0214 B0214 0.5605090914 0.58126869
B0215 B0215 0.0008357308 0.02341747
B0216 B0216 0.4388035480 0.56798830
B0217 B0217 0.1041649692 0.22435532
B0218 B0218          NaN        NaN
B0219 B0219 0.0631194694 0.22435532
B0220 B0220 0.1166255893 0.23890272
B0221 B0221 0.3054108219 0.47508350
B0302 B0302          NaN        NaN
B0303 B0303 0.1041649692 0.22435532
B0307 B0307 0.0024550065 0.04582679
B0309 B0309 0.1041649692 0.22435532
B0310 B0310          NaN        NaN
B0401 B0401          NaN        NaN
B0402 B0402          NaN        NaN
B0403 B0403          NaN        NaN
B0601 B0601 0.1041649692 0.22435532
B0602 B0602 0.1041649692 0.22435532
B0603 B0603 0.1194513584 0.23890272
B0604 B0604          NaN        NaN
B0605 B0605          NaN        NaN
B0701 B0701          NaN        NaN
B0702 B0702 0.3480882112 0.52683621
B0703 B0703          NaN        NaN
B0704 B0704 0.3981400285 0.55739604
B0705 B0705 0.2602709220 0.47016683
B0706 B0706 0.0199238062 0.11157331
B0707 B0707 0.0199238062 0.11157331
B0708 B0708 0.9410448183 0.94104482
B0709 B0709 0.1041649692 0.22435532
B0710 B0710 0.0220561153 0.11228568
B0711 B0711 0.0101037961 0.11157331
B0712 B0712 0.1041649692 0.22435532
B0801 B0801          NaN        NaN
B0802 B0802          NaN        NaN
B0803 B0803          NaN        NaN
B0804 B0804          NaN        NaN
B0805 B0805          NaN        NaN
B0901 B0901          NaN        NaN
B0902 B0902          NaN        NaN
B0903 B0903          NaN        NaN
B1004 B1004          NaN        NaN
B1006 B1006          NaN        NaN
B1008 B1008          NaN        NaN
B1011 B1011          NaN        NaN
B1012 B1012          NaN        NaN
B1014 B1014          NaN        NaN
B1021 B1021          NaN        NaN
B1022 B1022          NaN        NaN
B1024 B1024          NaN        NaN
B1026 B1026 0.0345084745 0.16103955
B1028 B1028          NaN        NaN
B1029 B1029          NaN        NaN
B1041 B1041          NaN        NaN
B1042 B1042          NaN        NaN
D0101 D0101 0.0199165012 0.11157331
D0102 D0102          NaN        NaN
D0103 D0103          NaN        NaN
D0104 D0104          NaN        NaN
D0201 D0201          NaN        NaN
D0202 D0202          NaN        NaN
D0203 D0203          NaN        NaN
D0204 D0204          NaN        NaN
D0205 D0205          NaN        NaN
D0206 D0206          NaN        NaN
D0207 D0207          NaN        NaN
D0208 D0208          NaN        NaN
D0209 D0209          NaN        NaN
D0210 D0210          NaN        NaN
D0211 D0211          NaN        NaN
D0212 D0212          NaN        NaN
D0213 D0213          NaN        NaN
D0301 D0301          NaN        NaN
D0302 D0302          NaN        NaN
D0303 D0303          NaN        NaN
D0304 D0304          NaN        NaN
D0305 D0305          NaN        NaN
D0306 D0306          NaN        NaN
D0307 D0307          NaN        NaN
D0308 D0308          NaN        NaN
D0309 D0309          NaN        NaN
D0310 D0310          NaN        NaN
D0401 D0401          NaN        NaN
D0402 D0402          NaN        NaN
D0403 D0403          NaN        NaN
D0404 D0404          NaN        NaN
D0405 D0405          NaN        NaN
D0406 D0406          NaN        NaN
D0407 D0407          NaN        NaN
D0408 D0408          NaN        NaN
D0501 D0501          NaN        NaN
D0502 D0502          NaN        NaN
D0503 D0503          NaN        NaN
D0504 D0504          NaN        NaN
D0505 D0505          NaN        NaN
D0506 D0506          NaN        NaN
D0507 D0507 0.0631194694 0.22435532
D0508 D0508          NaN        NaN
D0509 D0509 0.1041649692 0.22435532
D0510 D0510          NaN        NaN
D0511 D0511          NaN        NaN
D0512 D0512 0.5605090914 0.58126869
D0513 D0513 0.2946362754 0.47167436
D0516 D0516          NaN        NaN
D0517 D0517          NaN        NaN
D0518 D0518          NaN        NaN
D0601 D0601 0.0008363381 0.02341747
D0602 D0602          NaN        NaN
D0603 D0603          NaN        NaN
D0604 D0604          NaN        NaN
D0606 D0606          NaN        NaN
D0607 D0607          NaN        NaN
D0608 D0608          NaN        NaN
D0609 D0609          NaN        NaN
D0610 D0610          NaN        NaN
D0611 D0611          NaN        NaN
D0612 D0612          NaN        NaN
D0613 D0613          NaN        NaN
D0701 D0701          NaN        NaN
D0702 D0702          NaN        NaN
D0704 D0704          NaN        NaN
D0705 D0705          NaN        NaN
D0706 D0706          NaN        NaN
D0708 D0708          NaN        NaN
D0709 D0709          NaN        NaN
D0801 D0801          NaN        NaN
D0802 D0802          NaN        NaN
D0804 D0804          NaN        NaN
D0805 D0805          NaN        NaN
D0806 D0806          NaN        NaN
D0807 D0807 0.8098046490 0.82452837
D0808 D0808          NaN        NaN
D0809 D0809          NaN        NaN
D0810 D0810          NaN        NaN
D0811 D0811          NaN        NaN
D0812 D0812          NaN        NaN
D0813 D0813          NaN        NaN
D0814 D0814          NaN        NaN
D0815 D0815          NaN        NaN
D0816 D0816          NaN        NaN
D0817 D0817          NaN        NaN
D0818 D0818          NaN        NaN
D0819 D0819          NaN        NaN
D0901 D0901 0.4478557006 0.56798830
D0902 D0902 0.4689178506 0.56798830
D0903 D0903          NaN        NaN
D0904 D0904 0.3981400285 0.55739604
D0905 D0905          NaN        NaN
D0906 D0906          NaN        NaN
D0907 D0907 0.4689178506 0.56798830
D0908 D0908 0.4969897664 0.56798830
D0910 D0910 0.0199165012 0.11157331
D0911 D0911 0.2947964749 0.47167436
D0912 D0912 0.1041649692 0.22435532
S0101 S0101          NaN        NaN
S0103 S0103          NaN        NaN
S0104 S0104          NaN        NaN
S0105 S0105 0.2947964749 0.47167436
S0201 S0201 0.0707191711 0.22435532
S0202 S0202 0.4954727815 0.56798830
S0301 S0301 0.1247781764 0.24095096
effect_size <- sapply(gift_cols, function(g) {
  median(gift_df_meta[[g]][gift_df_meta$source=="GTDB"]) -
    median(gift_df_meta[[g]][gift_df_meta$source=="EHI"])
})

wilcox_res_source <- data.frame(
  GIFT = gift_cols,
  p_value = wilcox_results,
  p_adj = wilcox_results_adj,
  effect = effect_size
)


wilcox_res_source %>% dplyr::select(GIFT, p_adj, effect) %>% filter(p_adj<0.05)
       GIFT      p_adj effect
B0215 B0215 0.02341747      0
B0307 B0307 0.04582679      0
D0601 D0601 0.02341747      0
GIFT_db%>%
  filter(Code_element %in% c("B0215", "B0307", "D0601") )
  Code_bundle Code_element Code_function       Domain                           Function    Element
1     B021501        B0215           B02 Biosynthesis            Amino acid biosynthesis  Histidine
2     B030701        B0307           B03 Biosynthesis Amino acid derivative biosynthesis Spermidine
3     D060101        D0601           D06  Degradation      Nitrogen compound degradation    Nitrate
4     D060102        D0601           D06  Degradation      Nitrogen compound degradation    Nitrate
5     D060103        D0601           D06  Degradation      Nitrogen compound degradation    Nitrate
6     D060105        D0601           D06  Degradation      Nitrogen compound degradation    Nitrate
                                                                                                                                                Definition
1 K00765 ((K01523 K01496),K11755,K14152) (K01814,K24017) ((K02501+K02500),K01663) ((K01693 K00817 (K04486,K05602,K18649)),(K01089 K00817)) (K00013,K14152)
2                                                                                                                     (K01583,K01584,K01585,K02626) K01480
3                                                                               ((K00370+K00371+K00374),(K02567+K02568)) ((K00362+K00363),(K03385+K15876))
4                                                                                                                          1.7.5.1 1.7.2.1 1.7.2.5 1.7.2.4
5                                                                                                                                          1.9.6.1 1.7.2.2
6                                                                                                                                          1.7.7.2 1.7.7.1
wilcox_res_source <- map_df(gift_cols, function(g) {
  
 
  if(length(unique(gift_df_meta[[g]])) < 2) return(NULL)
  
  tryCatch({
    # Calculate p-value
    res <- gift_df_meta %>%
      wilcox_test(as.formula(paste0("`", g, "` ~ source")))
    
    # Calculate effect size (Rank-Biserial Correlation)
    eff <- gift_df_meta %>%
      wilcox_effsize(as.formula(paste0("`", g, "` ~ source")))
    
    # Combine results
    res %>%
      left_join(eff, by = c("group1", "group2", ".y." = ".y.", "n1", "n2")) %>%
      mutate(GIFT = g)
  }, error = function(e) return(NULL))
})

#  Adjust P-values and filter
wilcox_res_source <- wilcox_res_source %>%
  mutate(p_adj = p.adjust(p, method = "BH")) %>%
  dplyr::select(GIFT, group1, group2, p_adj, effsize, magnitude) %>%
  filter(p_adj < 0.05) %>%
  arrange(desc(abs(effsize))) # Sort by strongest effect

# View the top results
print(wilcox_res_source)
# A tibble: 3 × 6
  GIFT  group1 group2  p_adj effsize magnitude
  <chr> <chr>  <chr>   <dbl>   <dbl> <ord>    
1 B0215 EHI    GTDB   0.0234   0.348 moderate 
2 D0601 EHI    GTDB   0.0234   0.348 moderate 
3 B0307 EHI    GTDB   0.0459   0.315 moderate 
GIFTs_elements %>%
  melt() %>%
  dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
  inner_join(GIFT_db, by = "Code_element") %>%
  filter(Code_element %in% wilcox_res_source$GIFT) %>%
  left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
  distinct(Genome, Code_element, .keep_all = TRUE) %>%
  mutate(cluster = factor(cluster)) %>%
  droplevels() %>%
  arrange(cluster, Genome) %>%
  mutate(trait = Element) %>%
  ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
    geom_tile(colour = "white", linewidth = 0.2) +
    scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
    scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
  new_scale_fill() +
  
  # Add the source bar at the very top or bottom
  geom_tile(aes(y = -0.5, fill = source), height = 0.5) +
  scale_fill_manual(values = source_colors) +
    facet_grid(~ source, scales = "free_x", space = "free_x") + 
    theme(
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      strip.text.x = element_text(face = "bold")
    ) +
    labs(y = "Trait", x = "Genome", fill = "GIFT")
Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.